In [21]:
import numpy as np
import pandas as pd
# Read the training and test CSV files from the local ./titanic directory
# (training file first, then the test file).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./titanic/train.csv', './titanic/test.csv')
)
In [30]:
# --- Age-related features for the training frame ---------------------------

# Store the float cast back into the frame; the bare `.astype(...)` expression
# used previously produced a new Series that was immediately thrown away.
train_data['Age'] = train_data['Age'].astype(np.float64)

# Numeric encoding of the 'Sex' column: female -> 0, male -> 1.
train_data['Gender'] = train_data['Sex'].map({'female': 0, 'male': 1}).astype(int)

# For every row, the median of the recorded ages within that row's
# (Gender, Pclass) group. `median` ignores NaN, so missing ages do not
# influence the statistic. This replaces the nested loops over a fixed
# 2 x 3 grid of gender / class combinations.
group_median_age = train_data.groupby(['Gender', 'Pclass'])['Age'].transform('median')

# AgeFill keeps every recorded age and fills only the gaps with the group median.
train_data['AgeFill'] = train_data['Age'].fillna(group_median_age)

# 1 where the original age was missing (i.e. AgeFill was imputed), 0 otherwise.
train_data['AgeIsNull'] = train_data['Age'].isnull().astype(int)
In [31]:
# Derived features for the training frame.

# Sum of the SibSp and Parch counts for each row.
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch']

# Product of the imputed age and the passenger class. The column is spelled
# 'Age*Class' (capital C) so that it carries the same name as the matching
# column built for the test frame; it was previously 'Age*class' here.
train_data['Age*Class'] = train_data['AgeFill'] * train_data['Pclass']
In [37]:
# Discard the columns that are not passed to the model, including the raw
# 'Age' column (the 'AgeFill' column built earlier is kept instead).
# Note: this reassigns `train_data`, so running the cell a second time raises
# a KeyError because the columns are already gone.
train_data = train_data.drop(
    columns=['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age']
)
In [33]:
# --- Feature preparation for the test frame --------------------------------
# Mirrors the steps applied to the training frame so both end up with the
# same feature columns in the same order.

# Numeric encoding of the 'Sex' column: female -> 0, male -> 1.
test_data['Gender'] = test_data['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Per-row median of the recorded ages within the row's (Gender, Pclass) group
# (NaN ages are ignored by `median`). Replaces the nested loops over a fixed
# 2 x 3 grid.
# NOTE(review): these medians (and the fare median below) are computed from
# the test frame itself rather than reused from the training frame, exactly as
# before -- confirm that this is the intended imputation policy.
group_median_age = test_data.groupby(['Gender', 'Pclass'])['Age'].transform('median')

# Keep recorded ages; fill only the missing ones with the group median.
test_data['AgeFill'] = test_data['Age'].fillna(group_median_age)

# 1 where the original age was missing, 0 otherwise.
test_data['AgeIsNull'] = test_data['Age'].isnull().astype(int)

# Sum of the SibSp and Parch counts, and the age x class interaction term.
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch']
test_data['Age*Class'] = test_data['AgeFill'] * test_data['Pclass']

# Discard the columns that are not passed to the model, including raw 'Age'.
test_data = test_data.drop(
    columns=['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age']
)

# Fill missing fares with the median of the recorded fares.
faremedian = test_data['Fare'].median()
test_data['Fare'] = test_data['Fare'].fillna(faremedian)
In [44]:
# Convert both frames to plain NumPy arrays for the estimator.
train_matrix = train_data.values

# The first column of the training matrix is used as the target and the
# remaining columns as features.
# NOTE(review): this is purely positional -- presumably column 0 is the
# survival label at this point; verify against the training frame's columns.
y_train = train_matrix[:, 0]
X_train = train_matrix[:, 1:]

# The test frame has no target column, so every column is a feature.
X_test = test_data.values
In [47]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed for the forest's internal randomness so that re-running the
# notebook yields the same fitted model and the same predictions; without it
# every run produces a different result.
RANDOM_STATE = 42

# Fit a 100-tree random forest on the training features/target, then predict
# a label for every row of the test matrix.
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE).fit(X_train, y_train)
y_pred = forest.predict(X_test)
In [80]:
# Use the sample submission file as a template and replace its 'Survived'
# column with the model's predictions cast to integers.
# NOTE(review): predictions are matched to the template rows purely by
# position -- presumably the template lists rows in the same order as the
# test file; verify.
result = pd.read_csv('./titanic/gender_submission.csv').assign(
    Survived=y_pred.astype(int)
)

# Write the submission without the DataFrame index column.
result.to_csv('./titanic/submissions.csv', index=False)
In [ ]: